{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "92debf81",
   "metadata": {},
   "source": [
    "### Filtering criteria for deletion gene fusions \n",
    "\n",
    "* Overlap of misexpressed gene - 5'end only \n",
    "* Overlapping gene on same strand as misexpressed gene\n",
    "* Overlapping gene expresssed \n",
    "* Overlapping gene only partially deleted "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5e515d14",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from pybedtools import BedTool\n",
    "from io import StringIO\n",
    "import pysam\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a5a46a27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# constants \n",
    "tpm_cutoff = 0.5\n",
    "# variables \n",
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "# paths \n",
    "misexp_vrnt_feat_path = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv\")\n",
    "gencode_bed_path = wkdir_path.joinpath(\"3_misexp_genes/bed_files/all_genes.bed\")\n",
    "tpm_mtx_path = wkdir_path.joinpath(\"1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv\")\n",
    "# constants \n",
    "tpm_cutoff = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c6072b57",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of misexpression-associated deletions: 87\n"
     ]
    }
   ],
   "source": [
    "# load misexpression-associated variants \n",
    "misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep=\"\\t\")\n",
    "# list of misexpression-associated DELs \n",
    "misexp_del_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == \"DEL\"]\n",
    "misexp_dels = misexp_del_feat_df.vrnt_id.unique()\n",
    "print(f\"Number of misexpression-associated deletions: {len(misexp_dels)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "621c4971",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load Gencode genes .bed file \n",
    "gencode_bed = BedTool(gencode_bed_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0b13755c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes with median TPM > 0.5: 17418\n"
     ]
    }
   ],
   "source": [
    "# only include overlapping genes that are transcribed (median TPM > 0.5)\n",
    "# load gene expression matrix \n",
    "tpm_mtx_df = pd.read_csv(tpm_mtx_path)\n",
    "# add median expression of gene across INTERVAL \n",
    "median_tpm_df = pd.DataFrame(tpm_mtx_df.set_index(\"gene_id\").median(axis=1))\n",
    "median_tpm_df = median_tpm_df.rename(columns={0:\"median_tpm\"}).reset_index()\n",
    "genes_median_tpm05 = median_tpm_df[median_tpm_df.median_tpm > tpm_cutoff].gene_id.unique()\n",
    "print(f\"Number of genes with median TPM > {tpm_cutoff}: {len(genes_median_tpm05)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a2b7e116",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of DELs overlapping 5' end of misexpressed gene: 7\n",
      "DELs overlapping 5' end of misexpressed gene: DEL_chr4_13529219_13608506, 284739, DEL_chr9_135511950_135523352, 143283, 173184, 240913, DEL_chr20_63291972_63297412\n",
      "Percengate overlap 5' end of misexpressed gene: 8.045977011494253\n"
     ]
    }
   ],
   "source": [
    "### select misexpressed variant-gene pairs with partial 5' overlap\n",
    "misexp_del_5prime_overlap_df = misexp_del_feat_df[misexp_del_feat_df.position == \"Partial overlap 5' end\"]\n",
    "misexp_del_5prime_overlap = misexp_del_5prime_overlap_df.vrnt_id.unique()\n",
    "print(f\"Number of DELs overlapping 5' end of misexpressed gene: {len(misexp_del_5prime_overlap)}\")\n",
    "print(f\"DELs overlapping 5' end of misexpressed gene: {', '.join(misexp_del_5prime_overlap)}\")\n",
    "print(f\"Percengate overlap 5' end of misexpressed gene: {len(misexp_del_5prime_overlap)/len(misexp_dels) * 100}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a8a7110a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load variants in tested windows \n",
    "vrnt_id_in_window_chr_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "vrnt_id_in_window_chr_bed = BedTool(vrnt_id_in_window_chr_path)\n",
    "# intersect SVs and genes \n",
    "sv_overlap_gene_str = StringIO(str(vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)))\n",
    "# read intersect between SVs and genes \n",
    "columns={0: \"sv_chrom\", 1: \"sv_start\", 2: \"sv_end\", 3:\"vrnt_id\", \n",
    "         4:\"gene_chrom\", 5:\"overlap_gene_start\", 6:\"overlap_gene_end\", 7:\"overlap_gene_id\", \n",
    "         8:\"score\", 9:\"overlap_gene_strand\", 10:\"gene_overlap\"\n",
    "        }\n",
    "sv_overlap_gene_df = pd.read_csv(sv_overlap_gene_str, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=columns)\n",
    "# subset to relevant features \n",
    "sv_overlap_gene_feat_df = sv_overlap_gene_df[[\"vrnt_id\", \"overlap_gene_start\", \"overlap_gene_end\", \"overlap_gene_id\", \"overlap_gene_strand\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fa389ea7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add overlapping genes to misexpression DELs that overlap 5' end of misexpressed gene  \n",
    "misexp_del_5prime_overlap_df = pd.merge(misexp_del_5prime_overlap_df, \n",
    "                                       sv_overlap_gene_feat_df, \n",
    "                                       on=\"vrnt_id\", \n",
    "                                       how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8cf2b9a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def overlaps_gene_3_prime(row): \n",
    "    if row[\"overlap_gene_strand\"] == \"+\": \n",
    "        return (row[\"sv_start\"] < row[\"overlap_gene_end\"]) & (row[\"sv_start\"] > row[\"overlap_gene_start\"]) & (row[\"sv_end\"] > row[\"overlap_gene_end\"])\n",
    "    elif row[\"overlap_gene_strand\"] == \"-\": \n",
    "        return (row[\"sv_start\"] < row[\"overlap_gene_start\"]) & (row[\"overlap_gene_end\"] > row[\"sv_end\"]) & (row[\"overlap_gene_start\"] < row[\"sv_end\"])\n",
    "    else: \n",
    "        return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "93cfc290",
   "metadata": {},
   "outputs": [],
   "source": [
    "# annotate genes that overlap the 3' end \n",
    "misexp_del_5prime_overlap_df[\"overlap_gene_3_prime\"] = misexp_del_5prime_overlap_df.apply(overlaps_gene_3_prime, axis=1)\n",
    "# variant overlaps gene that is expressed in blood \n",
    "misexp_del_5prime_overlap_df[\"overlap_gene_expressed\"] = np.where(misexp_del_5prime_overlap_df.overlap_gene_id.isin(genes_median_tpm05), True, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c5949b57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# transcript fusion variant prioritisation \n",
    "misexp_del_fusion_candidates_df = misexp_del_5prime_overlap_df[(misexp_del_5prime_overlap_df.overlap_gene_strand == misexp_del_5prime_overlap_df.gene_strand) & \n",
    "                                                               (misexp_del_5prime_overlap_df.overlap_gene_3_prime) & \n",
    "                                                               (misexp_del_5prime_overlap_df.overlap_gene_expressed)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b96869d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of DEL transcript fusion candidates: 3\n",
      "Tx fusion candidate DELs:\n",
      " - DEL_chr4_13529219_13608506\n",
      " - 284739\n",
      " - 143283\n"
     ]
    }
   ],
   "source": [
    "misexp_del_fusion_candidates = misexp_del_fusion_candidates_df.vrnt_id.unique()\n",
    "print(f\"Number of DEL transcript fusion candidates: {len(misexp_del_fusion_candidates)}\")\n",
    "print(f\"Tx fusion candidate DELs:\")\n",
    "for vrnt_id in misexp_del_fusion_candidates: \n",
    "    print(f\" - {vrnt_id}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
